library(tidyverse)
library(dplyr)
library(maps)
library(spBayes)
library(viridis)
########
# Datasets description
# PA timestamp is recorded in GMT
# EPA timestamp is recorded in both GMT and EST
# However, the time difference between GMT and EST is always the same (5 hr)
# Which means that EPA timestamp may not account for the daytime light saving
# The latest EPA update is 08/31/2020
# NOTE(review): setwd() with an absolute path is fragile — it breaks on any
# other machine; consider here::here() or relative paths. Left as-is.
setwd("/Users/hongjianyang/Research/PA/")
# Load project helper functions (provides dfColNorm() and locOpe(), used below).
source('Code/Tools/myFunc.R')
###########################################################################
###########################################################################
###########################################################################
# 2020 Data
# Explore variance between 07/01/2020 - 07/07/2020
# 29 epa stations, 89 purple air stations (07/01)
###########################################################################
###########################################################################
###########################################################################
# Load hourly PurpleAir (PA) and EPA FRM PM2.5 observations for NC, 2020.
pa2020 <- read.csv('Data/NC_PA_FRM_PM/PA_2020_hourly_PM_NC.csv')
epa2020 <- read.csv('Data/NC_PA_FRM_PM/FRM_2020_hourly_PM_NC.csv')
# Subset PA data to location/time/PM columns and average duplicate readings
# recorded at the same site and hour.
col <- c('Lon', 'Lat', 'Timestamp', 'PM25_corrected')
pa <- pa2020 %>%
  select(all_of(col)) %>%  # all_of(): explicit selection by external vector
  rename(PM25 = PM25_corrected) %>%
  group_by(Lon, Lat, Timestamp) %>%
  summarise(PM25 = mean(PM25), .groups = 'drop') %>%  # drop grouping afterwards
  arrange(Lon, Lat, Timestamp)
# Remove rows with any missing values
pa <- pa[complete.cases(pa), ]
# Process EPA data: build a GMT timestamp string of the form
# 'YYYY-MM-DD HH:MM:00' from the separate date and hour columns.
epa2020$Timestamp <- paste(epa2020$Date_GMT, epa2020$Hour_GMT)
epa2020$Timestamp <- paste0(epa2020$Timestamp, ':00')  # paste0 over paste(sep = '')
col <- c('Lon', 'Lat', 'Timestamp', 'PM25')
epa <- epa2020 %>%
  select(all_of(col)) %>%  # all_of(): explicit selection by external vector
  arrange(Lon, Lat, Timestamp)
# Restrict both datasets to 07/01/2020 - 07/07/2020.
# NOTE(review): as.POSIXct() without an explicit tz parses in the local
# timezone; both series are parsed the same way so the comparison is
# internally consistent, but the header says PA is GMT — consider
# tz = 'GMT' explicitly. TODO confirm intended timezone.
pa$Timestamp <- as.POSIXct(pa$Timestamp)
epa$Timestamp <- as.POSIXct(epa$Timestamp, format = '%Y-%m-%d %H:%M:%OS')
startTime <- as.POSIXct('2020-07-01 00:00:00')
endTime <- as.POSIXct('2020-07-07 23:59:59')
pa <- subset(pa, Timestamp >= startTime & Timestamp <= endTime)
epa <- subset(epa, Timestamp >= startTime & Timestamp <= endTime)
# Elapsed seconds between the first and the last hourly timestamp of the week.
# NOTE(review): 'diff' shadows base::diff() from here on.
start <- as.numeric(startTime)
end <- as.numeric(as.POSIXct('2020-07-07 23:00:00'))
diff <- end - start
# Network indicator used as a model covariate: 1 = EPA (reference), 0 = PurpleAir
epa$indicator <- 1
pa$indicator <- 0
df_combine <- rbind(epa, pa)
# Build a regular 0.1-degree lon/lat grid and keep only the points that fall
# inside North Carolina (map.where() returns NA for points outside the state).
lon_grid <- seq(-90, -65, by = 0.1)
lat_grid <- seq(30, 38, by = 0.1)
s0 <- as.matrix(expand.grid(lon_grid, lat_grid))
innc <- map.where(map('state', 'north carolina', fill = TRUE), s0[, 1], s0[, 2])
s0 <- s0[!is.na(innc), ]

# X0 holds the prediction covariates: column-normalized polynomial location
# terms from locOpe(), plus a trailing 0 matching the network indicator
# covariate (0 = PurpleAir; see indicator columns built above).
interim <- dfColNorm(s0)
X0 <- cbind(locOpe(interim[, 1], interim[, 2]), 0)
# Number of hourly timestamps in the window (diff is in seconds, see above).
n_timestamp <- diff / 3600 + 1
# Preallocate a numeric result vector; vector(length = n) would create a
# logical vector that gets coerced on first assignment.
one_var <- numeric(n_timestamp)
n.samples <- 25000  # total MCMC samples per hourly fit
burn <- 5000        # samples discarded as burn-in before prediction
# Model: Y = \beta_{0} + \beta_{1}*Lon + \beta_{2}*Lat + \beta_{3}*Lon^2 +
# \beta_{4}*Lat^2 + \beta_{5}*LonLat + \beta_{6}I(epa)
# For each hour in the window: fit a Bayesian spatial regression with
# exponential covariance (spBayes::spLM), predict on the NC grid, and record
# the mean posterior predictive SD over the RTP bounding box for that hour.
for (i in 0:(n_timestamp - 1)) {
# Timestamp for this iteration: startTime + i hours
current <- as.POSIXct(3600 * i, origin = startTime)
subdf <- subset(df_combine, Timestamp == current)
S <- data.matrix(subdf[, 1:2])  # observation coordinates (Lon, Lat)
maxd <- max(dist(S))  # max inter-site distance; scales the phi prior below
# Construct covarites: standardize Lon/Lat, then polynomial terms via locOpe()
interim <- S
interim[, 1] <- (interim[, 1] - mean(interim[, 1])) / sd(interim[, 1])
interim[, 2] <- (interim[, 2] - mean(interim[, 2])) / sd(interim[, 2])
X <- locOpe(interim[, 1], interim[, 2])
X <- cbind(X, subdf$indicator)  # final column: 1 = EPA, 0 = PurpleAir
# Kriging
Y <- subdf$PM25
# Starting values split var(Y) evenly between partial sill and nugget;
# phi starts at an effective range of 5% of the max distance.
starting <- list("phi"=1/(0.05*maxd), "sigma.sq"=0.5*var(Y), "tau.sq"=0.5*var(Y))
tuning <- list("phi"=1, "sigma.sq"=0.05*var(Y), "tau.sq"=0.05*var(Y))
# Adaptive MCMC: batches of 100 iterations targeting a 0.43 acceptance rate
amcmc <- list("n.batch"=n.samples/100, "batch.length"=100, "accept.rate"=0.43)
# Priors: vague normal on the 7 regression coefficients; uniform on phi over
# effective ranges between 0.01*maxd and 2*maxd; IG(2, 1) on both variances
priors <- list("beta.Norm"=list(rep(0,7), 100*diag(7)),
"phi.Unif"=c(1/(2*maxd), 1/(0.01*maxd)),
"sigma.sq.IG"=c(2, 1),
"tau.sq.IG"=c(2, 1))
# Y ~ X - 1: no additional intercept (presumably locOpe() already supplies
# an intercept column — TODO confirm against myFunc.R)
fit_spbayes <- spLM(Y~X-1, coords=S,
starting=starting, tuning=tuning,
priors=priors, cov.model="exponential",
amcmc=amcmc, n.samples=n.samples,verbose=FALSE)
# Posterior predictive draws on the NC grid, thinned after burn-in
pred <- spPredict(fit_spbayes, pred.coords=s0, pred.covars=X0,
start=burn, thin=10, verbose=FALSE)
pred <- pred$p.y.predictive.samples
Yhat <- apply(pred,1,mean)  # posterior predictive mean (not used below)
Ysd <- apply(pred,1,sd)     # posterior predictive standard deviation
df <- data.frame(long=s0[,1],lat=s0[,2],Y=Ysd)
# Samples near RTP area
# Longitude range: -78.89, -78.52; Latitude range: 35.71, 35.92
rtp_sample <- subset(df, df$long <=-78.52 & df$long >= -78.89 & df$lat >= 35.71
& df$lat <= 35.92)
rtp_std <- mean(rtp_sample$Y)
one_var[i + 1] = rtp_std  # record mean predictive SD for this hour
# Plot standard deviation
spa <- sum(subdf$indicator == 0)  # number of PurpleAir sites this hour
pred_var <- ggplot(df, aes(long, lat)) +
geom_polygon(aes(x=long, y=lat)) +
geom_raster(aes(fill = Y)) +
geom_point(data = subdf, aes(Lon, Lat), colour = 'purple', size = 0.5) +
scale_fill_gradientn(colours = viridis(10))+
xlab(paste(current, 'pa:' ,spa))+ylab("")+labs(title="Posterior predictive standard deviation")+
coord_fixed()
plot(pred_var)
}








































































































































































# Plot NC county outlines with the monitor locations from subdf (left over
# from the final iteration of the kriging loop above).
# Renamed 'map' -> 'nc_counties': the original assignment shadowed maps::map(),
# which this script calls elsewhere.
nc_counties <- map_data("county")
nc <- subset(nc_counties, region == "north carolina")
R <- ggplot(data = nc) +
  geom_polygon(aes(x = long, y = lat, group = group)) +
  geom_point(data = subdf, aes(Lon, Lat), colour = 'purple', size = 0.5) +
  coord_fixed()
R

# Hourly mean posterior predictive SDs over the RTP box, one value per hour
# of the study week. The '## [..]' block below is the recorded console output
# from a previous run, kept for reference.
one_var
## [1] 1.9048368 1.5832198 1.5200750 1.6563294 1.4398364 1.5934195
## [7] 1.4973053 1.3427967 1.4712567 1.3217118 1.4989728 5.0032899
## [13] 0.9792305 1.3363501 3.0152614 1.9947599 1.4174218 1.2175833
## [19] 2.3087290 1.2261241 1.4834114 3.5019689 1.7703452 1.8094301
## [25] 1.5860019 1.6026113 1.7426921 1.5841446 1.7294246 1.7342763
## [31] 1.3831059 1.3724256 1.6106443 1.1407127 1.2730400 1.1259907
## [37] 1.2192255 1.5158505 1.5773947 1.0800112 0.8631125 1.0146638
## [43] 1.3478049 1.3111975 1.2398468 1.3112580 1.4580665 1.3840458
## [49] 1.6054990 3.5155082 2.7428236 3.2419055 2.6611634 3.0540734
## [55] 2.8503498 2.7266901 2.9118467 2.0380908 1.9859960 1.8466596
## [61] 2.0427892 1.7610580 1.6490430 1.4626731 0.9646056 1.8512729
## [67] 1.3290973 2.1195550 1.4925464 1.3499480 14.6531890 9.2243556
## [73] 1.9231597 3.2265338 3.5711976 4.2417595 4.4170137 5.3389522
## [79] 4.0870788 3.7464255 3.0644302 2.9690865 2.9395986 2.3930836
## [85] 2.4108763 2.5963960 2.3664499 2.6909745 2.3122420 1.8210420
## [91] 2.8446473 1.8336552 2.3167879 2.1936964 2.7104716 3.0800347
## [97] 2.8019361 16.4045181 25.0712051 19.9969423 16.7350831 16.7473956
## [103] 12.0809767 11.8051846 9.1699250 6.5857246 4.5789852 3.6883366
## [109] 2.9584011 2.5291041 2.4177612 2.5924377 2.1150160 1.8972906
## [115] 2.7303757 2.5666619 1.7726310 1.8439864 4.3175927 2.3918483
## [121] 1.9799893 3.8631553 2.5073974 1.9745292 2.0344041 1.8943319
## [127] 1.8298457 2.0460701 1.7995480 2.3591586 1.7315899 2.4513072
## [133] 1.7956905 1.7805624 2.2439236 1.1747335 1.9654415 1.7894427
## [139] 4.2895419 1.1891301 1.5994178 1.3230480 2.1357277 2.1292956
## [145] 1.5660845 1.8231866 2.1490869 1.5563236 2.1251874 1.5657276
## [151] 1.4879444 1.6608017 1.7666723 1.6862213 2.1901775 2.2921440
## [157] 2.0486098 1.7476744 1.6288742 1.6411623 1.2071630 1.6836439
## [163] 1.3765771 3.4007835 1.2159221 1.2555763 0.7956735 0.8614610
# One trace plot per day (24 hourly SD values each), then the full series.
for (day in 1:7) {
  day_idx <- seq(24 * (day - 1) + 1, 24 * day)
  plot(one_var[day_idx], type = 'b')
}

plot(one_var, type = 'b')